library(lfe)
library(dplyr)
library(tidyr)

source('Code/Clean Data Functions/clean_video_wave_1.R')
source('Code/Clean Data Functions/clean_video_wave_2.R')
source('Code/Clean Data Functions/clean_text_wave_1.R')
source('Code/Clean Data Functions/clean_text_wave_2.R')

# ---- Video experiment: participant-level (wide) table ----------------------
# NOTE(review): str_extract() is a stringr function, and this file does not
# call library(stringr) itself -- presumably one of the sourced cleaning
# scripts attaches it; confirm.

# Load each wave and tag every column (PID included) with its wave suffix.
vid1 <- import_video_wave_1()
names(vid1) <- paste0(names(vid1), 'w1')

vid2 <- import_video_wave_2()
names(vid2) <- paste0(names(vid2), 'w2')

# Wave 1 is the base table: all wave-1 rows are kept, and wave-2 columns are
# NA wherever no wave-2 record shares the participant ID. The raw treatment
# label is reduced to its yes/no/placebo token and mapped onto the
# pro/anti/placebo coding.
vid <- vid1 %>%
  left_join(vid2, by = c('PIDw1' = 'PIDw2')) %>%
  mutate(
    treatmentw1 = recode(
      str_extract(treatmentw1, 'yes|no|placebo'),
      'yes' = 'pro',
      'no' = 'anti',
      'placebo' = 'placebo'
    )
  )

# Keep only the first row for each participant ID.
vid <- vid[!duplicated(vid$PIDw1), ]

# Time elapsed between the two survey completions, as a difference of dates.
vid <- mutate(vid, gap = as.Date(EndDatew2) - as.Date(EndDatew1))

# ---- Video experiment: pooled first principal component --------------------
# Stack the seven attitude items from both waves into one row per
# participant-wave, keep only rows where every item is answered, and score
# those rows on the first principal component estimated on the pooled data.

# Long format: one row per participant x item x wave. PIDw1 is the only
# column that is not pivoted.
vid_complete_long <- pivot_longer(
  vid[, c('PIDw1',
          'scalew1', 'censorshipw1', 'privacyw1', 'congressw1',
          'influencew1', 'fav_techw1', 'break_techw1',
          'scalew2', 'censorshipw2', 'privacyw2', 'congressw2',
          'influencew2', 'fav_techw2', 'break_techw2')],
  cols = -PIDw1
)

# Split each column name into its wave suffix and the item name before it,
# then spread the items back out: one row per participant-wave.
vid_complete_long <- vid_complete_long %>%
  mutate(
    wave = str_extract(name, 'w1|w2'),
    var = str_extract(name, '\\w*(?=w1|w2)')
  ) %>%
  pivot_wider(id_cols = c('PIDw1', 'wave'),
              names_from = 'var', values_from = 'value')

# Listwise deletion: a participant-wave with any missing item is dropped.
vid_complete_long <- vid_complete_long[complete.cases(vid_complete_long), ]

# First principal component of the item columns. The items are selected by
# name (everything except the two ID columns) rather than by position.
# NOTE(review): prcomp() centers but does not rescale the items here
# (scale. defaults to FALSE) -- confirm the items share a response scale.
vid_complete_long$pc_both_waves_together <- prcomp(
  as.matrix(
    vid_complete_long[, setdiff(names(vid_complete_long), c('PIDw1', 'wave'))]
  ),
  retx = TRUE
)$x[, 1]

# The sign of a principal component is arbitrary; orient it so that it
# correlates positively with break_tech.
if (cor(vid_complete_long$pc_both_waves_together,
        vid_complete_long$break_tech,
        use = 'pairwise.complete.obs') < 0) {
  vid_complete_long$pc_both_waves_together <-
    -vid_complete_long$pc_both_waves_together
}

# Express the score in pooled standard-deviation units.
vid_complete_long$pc_both_waves_together <-
  vid_complete_long$pc_both_waves_together /
  sd(vid_complete_long$pc_both_waves_together)

# ---- Video experiment: attach pooled PC scores and completeness flags ------
# Spread the pooled PC score to one column per wave. names_prefix derives each
# column name from the wave label itself, so a score always lands under its
# own wave. pivot_wider() orders new columns by first appearance, so renaming
# them by position would swap the waves whenever the first complete row
# happens to be a wave-2 row.
wider <- pivot_wider(
  vid_complete_long,
  id_cols = 'PIDw1',
  names_from = 'wave',
  values_from = 'pc_both_waves_together',
  names_prefix = 'pc_both_waves_together'
)

# Join on the participant ID explicitly; participants with no complete
# participant-wave get NA scores.
vid <- left_join(vid, wider, by = 'PIDw1')

# Flags for participants observed in both waves on each outcome. pcw1/pcw2
# are not built in this file; they arrive as `pc` columns from the import
# functions (presumably per-wave principal-component scores -- confirm).
vid$complete_both_waves_break <- !is.na(vid$break_techw1) & !is.na(vid$break_techw2)
vid$complete_both_waves_pc <- !is.na(vid$pcw1) & !is.na(vid$pcw2)

# ---- Video experiment: participant-by-wave (long) panel --------------------
# Stack the two outcomes (break_tech and pc) across waves, label each stacked
# value with its wave and outcome name, then spread the outcomes back out so
# that every participant contributes one row per wave. Only the listed
# identifier, treatment, covariate and completeness columns are carried along.
vid_long <- vid %>%
  pivot_longer(cols = c('break_techw1', 'break_techw2', 'pcw1', 'pcw2')) %>%
  mutate(
    wave = str_extract(name, 'w1|w2'),
    var = str_extract(name, 'break_tech|pc')
  ) %>%
  pivot_wider(
    id_cols = c(
      'PIDw1', 'wave', 'treatmentw1', 'agew1', 'manw1', 'pid7w1', 'ideow1',
      'med_prefw1', 'native.americanw1', 'asianw1', 'blackw1', 'hispanicw1',
      'whitew1', 'middle.easternw1',
      'complete_both_waves_pc', 'complete_both_waves_break'
    ),
    names_from = 'var',
    values_from = 'value'
  ) %>%
  # Add the pooled PC score; the natural join matches on the shared columns
  # (participant ID and wave).
  left_join(vid_complete_long[, c('PIDw1', 'wave', 'pc_both_waves_together')])



# ---- Video experiment: outcomes regressed on treatment, wave by wave -------
# Each model regresses one wave's outcome on the wave-1 treatment arm using
# the participant-level table. The formulas have no fixed-effect, IV or
# cluster parts, so felm() fits a plain linear regression.
# NOTE(review): treatmentw1 is a character column, so the reference arm is
# presumably the alphabetically first level ('anti') -- confirm this is the
# intended baseline.

# Principal-component outcome.
summary(felm(pcw1 ~ treatmentw1, data = vid))
summary(felm(pcw2 ~ treatmentw1, data = vid))

# Single-item break_tech outcome.
summary(felm(break_techw1 ~ treatmentw1, data = vid))
summary(felm(break_techw2 ~ treatmentw1, data = vid))

# ---- Text experiment: participant-level (wide) table -----------------------
# NOTE(review): str_extract() is a stringr function, and this file does not
# call library(stringr) itself -- presumably one of the sourced cleaning
# scripts attaches it; confirm.

# Load each wave and tag every column (PID included) with its wave suffix.
text1 <- import_text_wave_1()
names(text1) <- paste0(names(text1), 'w1')

text2 <- import_text_wave_2()
names(text2) <- paste0(names(text2), 'w2')

# Wave 1 is the base table: all wave-1 rows are kept, and wave-2 columns are
# NA wherever no wave-2 record shares the participant ID. The raw treatment
# label is reduced to its pro/anti/control token, and 'control' is renamed to
# 'placebo' so the coding matches the video experiment.
text <- text1 %>%
  left_join(text2, by = c('PIDw1' = 'PIDw2')) %>%
  mutate(
    treatmentw1 = recode(
      str_extract(treatmentw1, 'pro|anti|control'),
      'pro' = 'pro',
      'anti' = 'anti',
      'control' = 'placebo'
    )
  )

# Keep only the first row for each participant ID.
text <- text[!duplicated(text$PIDw1), ]

# Time elapsed between the two survey completions, as a difference of dates.
text <- mutate(text, gap = as.Date(EndDatew2) - as.Date(EndDatew1))


# ---- Text experiment: pooled first principal component ---------------------
# Stack the seven attitude items from both waves into one row per
# participant-wave, keep only rows where every item is answered, and score
# those rows on the first principal component estimated on the pooled data.

# Long format: one row per participant x item x wave. PIDw1 is the only
# column that is not pivoted.
text_complete_long <- pivot_longer(
  text[, c('PIDw1',
           'scalew1', 'censorshipw1', 'privacyw1', 'congressw1',
           'influencew1', 'fav_techw1', 'break_techw1',
           'scalew2', 'censorshipw2', 'privacyw2', 'congressw2',
           'influencew2', 'fav_techw2', 'break_techw2')],
  cols = -PIDw1
)

# Split each column name into its wave suffix and the item name before it,
# then spread the items back out: one row per participant-wave.
text_complete_long <- text_complete_long %>%
  mutate(
    wave = str_extract(name, 'w1|w2'),
    var = str_extract(name, '\\w*(?=w1|w2)')
  ) %>%
  pivot_wider(id_cols = c('PIDw1', 'wave'),
              names_from = 'var', values_from = 'value')

# Listwise deletion: a participant-wave with any missing item is dropped.
text_complete_long <- text_complete_long[complete.cases(text_complete_long), ]

# First principal component of the item columns. The items are selected by
# name (everything except the two ID columns) rather than by position.
# NOTE(review): prcomp() centers but does not rescale the items here
# (scale. defaults to FALSE) -- confirm the items share a response scale.
text_complete_long$pc_both_waves_together <- prcomp(
  as.matrix(
    text_complete_long[, setdiff(names(text_complete_long), c('PIDw1', 'wave'))]
  ),
  retx = TRUE
)$x[, 1]

# The sign of a principal component is arbitrary; orient it so that it
# correlates positively with break_tech.
if (cor(text_complete_long$pc_both_waves_together,
        text_complete_long$break_tech,
        use = 'pairwise.complete.obs') < 0) {
  text_complete_long$pc_both_waves_together <-
    -text_complete_long$pc_both_waves_together
}

# Express the score in pooled standard-deviation units.
text_complete_long$pc_both_waves_together <-
  text_complete_long$pc_both_waves_together /
  sd(text_complete_long$pc_both_waves_together)

# ---- Text experiment: attach pooled PC scores and completeness flags -------
# Spread the pooled PC score to one column per wave. names_prefix derives each
# column name from the wave label itself, so a score always lands under its
# own wave. pivot_wider() orders new columns by first appearance, so renaming
# them by position would swap the waves whenever the first complete row
# happens to be a wave-2 row.
wider <- pivot_wider(
  text_complete_long,
  id_cols = 'PIDw1',
  names_from = 'wave',
  values_from = 'pc_both_waves_together',
  names_prefix = 'pc_both_waves_together'
)

# Join on the participant ID explicitly; participants with no complete
# participant-wave get NA scores.
text <- left_join(text, wider, by = 'PIDw1')

# Flags for participants observed in both waves on each outcome. pcw1/pcw2
# are not built in this file; they arrive as `pc` columns from the import
# functions (presumably per-wave principal-component scores -- confirm).
text$complete_both_waves_break <- !is.na(text$break_techw1) & !is.na(text$break_techw2)
text$complete_both_waves_pc <- !is.na(text$pcw1) & !is.na(text$pcw2)



# ---- Text experiment: participant-by-wave (long) panel ---------------------
# Stack the two outcomes (break_tech and pc) across waves, label each stacked
# value with its wave and outcome name, then spread the outcomes back out so
# that every participant contributes one row per wave. Only the listed
# identifier, treatment, covariate and completeness columns are carried along.
text_long <- text %>%
  pivot_longer(cols = c('break_techw1', 'break_techw2', 'pcw1', 'pcw2')) %>%
  mutate(
    wave = str_extract(name, 'w1|w2'),
    var = str_extract(name, 'break_tech|pc')
  ) %>%
  pivot_wider(
    id_cols = c(
      'PIDw1', 'wave', 'treatmentw1', 'agew1', 'manw1', 'pid7w1', 'ideow1',
      'med_prefw1', 'native.americanw1', 'asianw1', 'blackw1', 'hispanicw1',
      'whitew1', 'middle.easternw1',
      'complete_both_waves_pc', 'complete_both_waves_break'
    ),
    names_from = 'var',
    values_from = 'value'
  ) %>%
  # Add the pooled PC score; the natural join matches on the shared columns
  # (participant ID and wave).
  left_join(text_complete_long[, c('PIDw1', 'wave', 'pc_both_waves_together')])


